summaryrefslogtreecommitdiffstats
path: root/app.py
blob: 4fb9add6d0aefcb7aa6dba75d1164f74be662112 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/python3
from sys import argv
import logging
from time import localtime, mktime, time
import requests
from base64 import b64decode
from datetime import datetime, timedelta, timezone
try:
	from sqlalchemy import Table, MetaData, Integer, BigInteger, String, Column, Table, ForeignKey, create_engine, select, DateTime
	from sqlalchemy.orm import declarative_base, relationship, Session
except ModuleNotFoundError:
	raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy")
try:
	from bs4 import BeautifulSoup, FeatureNotFound
except ModuleNotFoundError:
	raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")

# Contact string that is sent to the server inside the User-Agent header.
# It is read defensively: with an unconditional argv[2] a missing argument died
# right here with a bare IndexError, so the descriptive usage error raised in the
# __main__ block could never be reached and the module could not be imported
# without command-line arguments.
operator_contact = argv[2] if len(argv) > 2 else "unspecified"

# Declarative base shared by the ORM classes below; the __main__ block creates
# the tables through Base.metadata.create_all().
Base = declarative_base()

class Book(Base):
	"""ORM mapping of the "books" table: one row per book, keyed by ISBN.

	The meaning of every column is given in that column's ``doc`` string.
	"""

	__tablename__ = "books"

	isbn = Column(
		BigInteger,
		primary_key=True,
		nullable=False,
		doc="book isbn. found in URL http://www/isbn/978 and in acsm: resource, dc:identifier (sometimes not), thumbnailURL",
	)
	title = Column(
		String,
		nullable=True,
		doc="title of the book, dcc:title in acsm",
	)
	creator = Column(
		String,
		nullable=True,
		doc="author of the book, dc:creator in acsm",
	)
	publisher = Column(
		String,
		nullable=True,
		doc="publisher of the book, dc:publisher in acsm",
	)
	identifier = Column(
		String,
		nullable=True,
		doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.",
	)
	thumbnail_extension = Column(
		String,
		nullable=True,
		doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element",
	)
	format = Column(
		String,
		nullable=True,
		doc="format of the file. I've seen application/pdf and application/epub+zip",
	)
	language = Column(
		String,
		nullable=True,
		doc="language of the book. I've seen sl.",
	)

	# Reverse side of Borrow.book: all borrows recorded for this book.
	borrows = relationship("Borrow", back_populates="book")

	def __repr__(self):
		"""Return a debug string showing isbn, title, creator and publisher."""
		shown = ", ".join(
			f"{name}={getattr(self, name)!r}"
			for name in ("isbn", "title", "creator", "publisher")
		)
		return f"Book({shown})"

class Borrow(Base):
	"""ORM mapping of the "borrows" table: one row per downloaded acsm.

	The meaning of every column is given in that column's ``doc`` string.
	"""

	__tablename__ = "borrows"
	id = Column(Integer, primary_key=True, nullable=False, doc="id in transaction element of acsm or in filename of acsm on http")
	isbn = Column(ForeignKey("books.isbn"), nullable=False, doc="foreign key that leads to a book")
	transaction = Column(String, nullable=True, doc="transaction element content, but only if it couldn't be derived from format ACS-BIBL-L-{acsm_id}, otherwise Null")
	purchase_utc = Column(DateTime, nullable=True, doc="acsm purchase element excluding timezone in UTC")
	expiration_utc = Column(DateTime, nullable=True, doc="acsm expiration element excluding timezone in UTC")
	purchase_timezone = Column(Integer, nullable=True, doc="acsm purchase element timezone offset from UTC in seconds (note that purchase is UTC)")
	expiration_timezone = Column(Integer, nullable=True, doc="acsm expiration element timezone offset from UTC in seconds (note that expiration is UTC)")
	obtained = Column(BigInteger, nullable=False, doc="UNIX timestamp when this borrow was obtained as acsm from http")
	duration = Column(Integer, nullable=True, doc="duration in seconds that a DRM client may make the book available")
	# Reverse side of Book.borrows: the book this borrow refers to.
	book = relationship("Book", back_populates="borrows")

	def __repr__(self):
		"""Return a debug string listing every stored field and the related book.

		``obtained`` is rendered as the ``localtime()`` struct of the stored
		UNIX timestamp, wrapped in a literal ``mktime(...)``.
		"""
		# The separator after purchase_timezone used to be missing, which glued
		# that value to the following "expiration=" field in log output.
		return (
			f"Borrow(id={self.id!r}, isbn={self.isbn!r}, "
			f"purchase={self.purchase_utc!r}, purchase_timezone={self.purchase_timezone!r}, "
			f"expiration={self.expiration_utc!r}, expiration_timezone={self.expiration_timezone!r}, "
			f"obtained=mktime({localtime(self.obtained)!r}), duration={self.duration!r}, book={self.book!r})"
		)

# Configure the root logger without a level threshold (NOTSET), so debug
# messages are emitted too.
logging.basicConfig(level=logging.NOTSET)
# The logger is named after the script path given on the command line.
logger = logging.getLogger(argv[0])
logger.debug("welcome to %s", argv[0])

# First acsm id requested by update() when the database holds no borrows yet.
starting_acsm_id = 177238
# Sanity bound used by update(): if the run gives up on consecutive non-200
# responses below this id, an error is logged.
guaranteed_large_acsm_id = 1170487

# Added to the last dash-separated group of a resource URN to obtain the value
# stored as ISBN. NOTE(review): presumably this restores the leading 9 of a
# 13-digit ISBN - confirm against real acsm data.
_ISBN_OFFSET = 9_000_000_000_000


def _split_utc_and_offset(stamp):
	"""Parse an acsm timestamp into (naive UTC datetime, UTC offset in seconds)."""
	aware = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S%z")
	# total_seconds() keeps the sign of the offset; timedelta.seconds would turn
	# an offset of -01:00 into 82800 because the negative part lives in .days.
	offset = int(aware.utcoffset().total_seconds())
	return aware.astimezone(timezone.utc).replace(tzinfo=None), offset


def _parse_acsm(acsm_id, text):
	"""Validate one fulfilled acsm document and return (isbn, book_fields, borrow_fields)."""
	try:
		acsm = BeautifulSoup(text, "xml", from_encoding="UTF-8")
	except FeatureNotFound as err:
		raise FeatureNotFound("pip3 install lxml") from err
	ft = acsm.fulfillmentToken
	item = ft.resourceItemInfo
	metadata = item.metadata

	# The transaction is only stored when it deviates from the predictable format.
	transaction = None
	expected = f"ACS-BIBL-L-{acsm_id}"
	if ft.transaction.string != expected:
		transaction = ft.transaction.string
		logger.info(f"expected {expected} in transaction.string, but instead received {ft.transaction.string} in acsm {acsm_id}")

	resource = item.resource.string
	isbn = int(resource.split("-").pop()) + _ISBN_OFFSET

	# A missing identifier element keeps the literal "noidentifier"; an identifier
	# that parses to a non-zero number is treated as the ISBN and not stored.
	# NOTE(review): the parsed number is never compared with isbn - confirm that
	# any numeric identifier may really be dropped.
	identifier = "noidentifier"
	identifier_to_isbn = 0
	try:
		identifier = metadata.identifier.string
		identifier_to_isbn = int(identifier.split(":").pop().replace("-", ""))
	except (ValueError, AttributeError):
		pass
	if identifier_to_isbn != 0:
		identifier = None

	license_resource = ft.licenseToken.resource.string
	if license_resource != resource:
		raise ValueError(f"expected {resource} in ft.resourceItemInfo.licenseToken.resource.string but instead received {license_resource} in acsm {acsm_id}")

	# The thumbnail URL must be derivable from the resource uuid; only its
	# extension is kept. A missing thumbnailURL element is allowed.
	uuid = resource.split(":").pop()
	expected = f"https://cs.alliance.inkbook.eu/books/{uuid}."
	thumbnail = metadata.thumbnailURL
	if thumbnail is None:
		thumbnail_extension = None
	else:
		thumbnail_url = thumbnail.string
		if thumbnail_url is None:
			raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}")
		if not thumbnail_url.startswith(expected):
			raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {thumbnail_url} in acsm {acsm_id}")
		thumbnail_extension = thumbnail_url.split(".").pop()

	permissions = item.licenseToken.permissions
	duration = int(permissions.display.duration.string)
	play_duration = int(permissions.play.duration.string)
	if duration != play_duration:
		# The original message referenced an undefined name and therefore raised
		# NameError instead of this ValueError.
		raise ValueError(f"expected {duration} in int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string) but instead received {play_duration} in acsm {acsm_id}")

	# The decoded hmac is not stored; decoding only checks that it is valid base64.
	b64decode(ft.hmac.string, validate=True)

	purchase_utc, purchase_timezone = _split_utc_and_offset(ft.purchase.string)
	expiration_utc, expiration_timezone = _split_utc_and_offset(ft.expiration.string)

	book_fields = {
		"identifier": identifier,
		"title": metadata.find(name="dc:title").string,
		"creator": metadata.creator.string,
		"publisher": metadata.publisher.string,
		"thumbnail_extension": thumbnail_extension,
		"language": metadata.language.string,
		"format": metadata.format.string,
	}
	borrow_fields = {
		"transaction": transaction,
		"purchase_utc": purchase_utc,
		"expiration_utc": expiration_utc,
		"purchase_timezone": purchase_timezone,
		"expiration_timezone": expiration_timezone,
		"duration": duration,
	}
	return isbn, book_fields, borrow_fields


def update(engine, hmfan2iarts=100, request_timeout=60):
	"""Download acsm files with increasing ids and store them as Book/Borrow rows.

	Starts after the highest Borrow id in the database (or at starting_acsm_id
	when there is none) and keeps requesting the next id until the server stops
	answering with 200.

	Args:
		engine: SQLAlchemy engine of the database to update.
		hmfan2iarts: number of consecutive non-200 responses after which the
			run stops.
		request_timeout: timeout in seconds passed to requests.get, so a stalled
			connection raises instead of blocking forever.

	Returns:
		dict with the counters "valid_acsms", "only_isbn_acsms", "failed_acsms",
		"failed_acsms_not200" and the last requested "acsm_id".

	Raises:
		ValueError: when a fulfilled acsm contradicts the expected structure.
		FeatureNotFound: when the XML parser backend for BeautifulSoup is missing.
	"""
	force_acsm_id = 0
	valid_acsms = 0
	only_isbn_acsms = 0
	failed_acsms = 0
	failed_acsms_not200 = 0
	failed_acsms_not200_in_a_row = 0
	borrow = None
	headers = {"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"}
	isbn_only_prefixes = (
		'<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_NO_SUCH_RESOURCE resid urn:uuid:00000000-1002-0000-0009-',
		'<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_NO_DISTRIBUTION_RIGHTS urn:uuid:00000000-1002-0000-0009-',
	)
	with Session(engine) as session:
		while True:
			if force_acsm_id != 0:
				# A previous iteration asked to skip ahead without storing anything.
				acsm_id = force_acsm_id
				force_acsm_id = 0
			else:
				borrow = session.scalars(select(Borrow).order_by(Borrow.id.desc()).limit(1)).first()
				if borrow is None:
					logger.info(f"oooh, it looks like this is a fresh start, db contains no borrows. I'll start with hardcoded acsm id {starting_acsm_id}")
					acsm_id = starting_acsm_id
				else:
					logger.info(f"continuing from latest {borrow}")
					acsm_id = borrow.id+1
			r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm", headers=headers, timeout=request_timeout)
			r.encoding = "UTF-8"
			if r.status_code != 200:
				# borrow is None on an empty database; the unguarded attribute access
				# used to crash with AttributeError in that case.
				an_hour_ago = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(hours=1)
				if borrow is not None and borrow.purchase_utc is not None and borrow.purchase_utc > an_hour_ago:
					logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id} and the last requested acsm was created less than an hour ago")
					break
				logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {hmfan2iarts-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.")
				failed_acsms_not200 += 1
				failed_acsms_not200_in_a_row += 1
				force_acsm_id = acsm_id+1
				if failed_acsms_not200_in_a_row == hmfan2iarts:
					logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means {hmfan2iarts} concurrent responses that are not 200.")
					if acsm_id < guaranteed_large_acsm_id:
						logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.")
					break
				continue
			failed_acsms_not200_in_a_row = 0
			if r.text.startswith("Napaka pri prenosu"):
				logger.warning(f"'napaka pri prenosu' received from http for acsm id {acsm_id}, skipping")
				force_acsm_id = acsm_id+1
			elif r.text.startswith(isbn_only_prefixes):
				# Only the resource URN is available: store a borrow carrying just the isbn.
				urn = [x for x in r.text.split() if x.startswith("urn:uuid:00000000-1002-0000-0009-")][0]
				isbn = int(urn.split("-").pop()) + _ISBN_OFFSET
				borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time()))
				logger.warning(f"received either 'no such resource' or 'no distribution rights' from server and stored a quite empty {borrow}")
				session.add(borrow)
				session.commit()
				only_isbn_acsms += 1
			elif r.text.startswith('<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_PARAMETER_SYNTAX rights lrt http://cs.alliance.inkbook.eu:443/fulfillment/URLLink.acsm"/>'):
				logger.warning(f"received urllink parameter syntax error with no usable data for acsm {acsm_id}, so I did not store anything")
				force_acsm_id = acsm_id+1
				if 199999 <= acsm_id <= 999999:
					logger.warning("on 2022-11-07, library removed access for acsms 200000-999999. skipping to 1000000")
					force_acsm_id = 1000000
				failed_acsms += 1
			else:
				isbn, book_fields, borrow_fields = _parse_acsm(acsm_id, r.text)
				book = session.get(Book, isbn)
				if book is None:
					book = Book(isbn=isbn, **book_fields)
				else:
					# Refresh the stored book with the values of the newest acsm.
					for name, value in book_fields.items():
						setattr(book, name, value)
				borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time()), book=book, **borrow_fields)
				logger.info(f"found a new {borrow!r}")
				session.add(borrow)
				session.commit()
				valid_acsms += 1
	logger.info(f"In this update, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")
	return {"valid_acsms": valid_acsms, "only_isbn_acsms": only_isbn_acsms, "failed_acsms": failed_acsms, "failed_acsms_not200": failed_acsms_not200, "acsm_id": acsm_id}

if __name__ == "__main__":
	# Script name plus exactly two arguments: database URL and operator contact.
	if len(argv) != 3:
		raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")
	# echo=True makes SQLAlchemy log the SQL it emits.
	engine = create_engine(argv[1], future=True, echo=True)
	# Create the tables of all mapped classes if they do not exist yet.
	Base.metadata.create_all(engine)
	logger.debug("created metadata.")
	try:
		r = update(engine)
	except KeyboardInterrupt:
		# Ctrl-C is treated as a normal way to stop a running update.
		logger.warning("Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")